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INSERT INTO RiIDF (token, idf)
SELECT T.token, LOG(S.size) - LOG(COUNT(UNIQUE(*)))
FROM RiTokens T, RiSize S
GROUP BY T.token, S.size
(a) Relation with token idf counts

INSERT INTO RiLength(tid, len)

SELECT T.tid, SQRT(SUM(I.idf * I.idf * T.tf * T.tf))

FROM RiIDF I, RiTF T

WHERE I.token = T.token

GROUP BY T.tid

(c) Relation with weight-vector lengths



INSERT INTO RiSum (token, total)

SELECT R.token, SUM(R.weight)

FROM RiWeights R

GROUP BY R.token

(e) Relation with total token weights



INSERT INTO RiTF(tid, token, tf)

SELECT T.tid, T.token, COUNT(*)

FROM RiTokens T

GROUP BY T.tid, T.token

(b) Relation with token tf counts

INSERT INTO RiWeights (tid, token, weight)
SELECT T.tid, T.token, I.idf * T.tf / L.len
FROM RiIDF I, RiTF T, RiLength L
WHERE I.token = T.token AND T.tid = L.tid

(d) Final relation with normalized tuple
weight vectors

INSERT INTO RiSize (size) 
SELECT COUNT (*) 
FROM Ri 

(f) Dummy relation used to create RilDF 



Fi 
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SELECT r1w.tid AS tid1, r2w.tid AS tid2

FROM R1Weights r1w, R2Weights r2w

WHERE r1w.token = r2w.token

GROUP BY r1w.tid, r2w.tid

HAVING SUM(r1w.weight * r2w.weight) > φ





Title: Text Joins for Data Cleansing and Integration in a Relational Database Management System 

Applicants: Koudas et al. 
Docket No.: 1209-29 



3/12 



J* 



SELECT rw.tid, rw.token, rw.weight / rs.total AS P
FROM RiWeights rw, RiSum rs
WHERE rw.token = rs.token



FIG. 3
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INSERT INTO RiSample(tid, token, c)

SELECT rw.tid, rw.token, ROUND(S * rw.weight / rs.total, 0) AS c
FROM RiWeights rw, RiSum rs
WHERE rw.token = rs.token
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SELECT r1w.tid AS tid1, r2s.tid AS tid2

m . — 

FROM R1Weights r1w, R2Sample r2s, R2Sum r2sum, R1V r1v
HAVING SUM(r1w.weight * r2sum.total / r1v.Tv) > S * φ' / r1v.Tv
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SELECT tid1, tid2
FROM

(

SELECT r1w.tid AS tid1, r2s.tid AS tid2, SUM(r1w.weight * r2sum.total) AS Ci
FROM R1Weights r1w, R2Sample r2s, R2Sum r2sum

WHERE r1w.token = r2s.token AND r1w.token = r2sum.token AND r1w.tid = r1v.tid
GROUP BY r1w.tid, r2s.tid



SELECT r1s.tid AS tid1, r2w.tid AS tid2, SUM(r2w.weight * r1sum.total) AS Ci
FROM R2Weights r2w, R1Sample r1s, R1Sum r1sum

WHERE r2w.token = r1s.token AND r2w.token = r1sum.token AND r2w.tid = r2v.tid
GROUP BY r2w.tid, r1s.tid
) SYM

GROUP BY tid1, tid2
HAVING AVG(Ci) > S * φ'



UNION ALL 
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SELECT r1s.tid AS tid1, r2s.tid AS tid2

FROM R1Sample r1s, R2Sample r2s, R1Sum r1sum, R2Sum r2sum

WHERE r1s.token = r1sum.token AND R2Sample.token = r2sum.token AND r1s.token = r2s.token
GROUP BY r1s.tid, r2s.tid

HAVING SUM(r1sum.total * r2sum.total) > S * S * φ'
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S 




(b) Q-grams for q = 2 




(c) Q-grams for q = 3
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